Package org.terrier.structures

Source Code of org.terrier.structures.CompressingMetaIndex$LoggingDocid2OffsetLookup

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org/
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is CompressingMetaIndex.java
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original contributor)
*/
package org.terrier.structures;

import gnu.trove.TObjectIntHashMap;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import org.terrier.sorting.HeapSortInt;
import org.terrier.structures.collections.FSOrderedMapFile;
import org.terrier.structures.collections.OrderedMap;
import org.terrier.structures.seralization.FixedSizeIntWritableFactory;
import org.terrier.structures.seralization.FixedSizeTextFactory;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;
import org.terrier.utility.Wrapper;
import org.terrier.utility.io.HadoopUtility;
import org.terrier.utility.io.RandomDataInput;
import org.terrier.utility.io.RandomDataInputMemory;
import org.terrier.utility.io.WrappedIOException;

/** A {@link MetaIndex} implementation that compresses its contents.
 * Values have maximum lengths, but each record's value blob is stored
 * zlib-compressed, and is decompressed on access using java.util.zip.Inflater.
* @author Craig Macdonald &amp; Vassilis Plachouras
* @since 3.0
*/
@SuppressWarnings("deprecation")
public class CompressingMetaIndex implements MetaIndex {
  /** logger to be used in this class */
  private static Logger logger = Logger.getLogger(CompressingMetaIndex.class);
  /**
   * A Hadoop InputFormat for a compressing meta index (allows the contents of a
   * meta index to be read as the input to a MapReduce job).
   */
  public static class CompressingMetaIndexInputFormat implements InputFormat<IntWritable, Wrapper<String[]>>
  {
    static String STRUCTURE_NAME_JC_KEY = "MetaIndexInputStreamRecordReader.structureName";
    /**
     * Records in the JobConf the name of the meta index structure to be read.
     * @param jc job configuration
     * @param metaStructureName name of the meta structure (e.g. "meta")
     */
    public static void setStructure(JobConf jc, String metaStructureName)
    {
      jc.set(STRUCTURE_NAME_JC_KEY, metaStructureName);
    }
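
    /* Hedged usage sketch (not part of the original class): how a MapReduce job
     * might be wired up to stream a meta index through this InputFormat. The
     * structure name "meta" is an assumption; the remaining calls are standard
     * Hadoop 0.20 mapred JobConf API. */
    static void exampleConfigureJob(JobConf exampleJob)
    {
      //record which meta structure the record reader should open
      setStructure(exampleJob, "meta");
      //this InputFormat then emits (docid, String[] metadata values) pairs to the mappers
      exampleJob.setInputFormat(CompressingMetaIndexInputFormat.class);
    }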
   
    static class MetaIndexSplit extends FileSplit
    {
      int startId;
      int endId;
     
      public MetaIndexSplit(){
        super(null, (long)0, (long)0, new String[0]);
      }
     
      public MetaIndexSplit(Path file, long start, long length, String[] hosts, int _startId, int _endId) {
        super(file, start, length, hosts);
        startId = _startId;
        endId = _endId;
      }     
     
      public void readFields(DataInput in) throws IOException {
        super.readFields(in);
        startId = in.readInt();
        endId = in.readInt();
      }

      public void write(DataOutput out) throws IOException {
        super.write(out);
        out.writeInt(startId);
        out.writeInt(endId);
      }
     
      public String toString()
      {
        StringBuilder rtr = new StringBuilder();
        rtr.append("MetaIndexSplit: BlockSize=").append(this.getLength());
        rtr.append(" startAt=").append(+this.getStart());
        try{
          rtr.append(" hosts=");
          rtr.append(ArrayUtils.join(this.getLocations(), ","));
        }
        catch (IOException ioe ) {
          //logger.warn("Problem getting locations", ioe);
        }
        rtr.append(" ids=["+startId+","+endId +"]");
        return rtr.toString();
      }
    }   
   
    static class MetaIndexInputStreamRecordReader implements RecordReader<IntWritable, Wrapper<String[]>>
    {
      final InputStream in;
      final int startID;
      final int endID;
     
      public MetaIndexInputStreamRecordReader(Index index, String structureName, int startingDocID, int endingID)
        throws IOException
      {
        in = new InputStream(index, structureName, startingDocID, endingID);
        startID = startingDocID;
        endID = endingID;
      }
     
      public void close() throws IOException {
        in.close();
      }

      public IntWritable createKey() {
        return new IntWritable();
      }

      public Wrapper<String[]> createValue() {
        return new Wrapper<String[]>();
      }

      public long getPos() throws IOException {
        return 0;
      }

      public float getProgress() throws IOException {
        return (float)(in.getIndex() - startID)/(float)(endID - startID);
      }

      public boolean next(IntWritable docid, Wrapper<String[]> values)
          throws IOException
      {
        if (! in.hasNext())
          return false;
        //these methods MUST have this order
        values.setObject(in.next());
        docid.set(in.getIndex());
        return true;
      }
     
    }
   
   
    /**
     * {@inheritDoc}
     */
    public RecordReader<IntWritable, Wrapper<String[]>> getRecordReader(
        InputSplit _split, JobConf jc, Reporter reporter)
        throws IOException
    {
      HadoopUtility.loadTerrierJob(jc);
     
      //load the index
      Index.setIndexLoadingProfileAsRetrieval(false);
      Index index = HadoopUtility.fromHConfiguration(jc);
      if (index == null)
        throw new IOException("Index could not be loaded from JobConf: " + Index.getLastIndexLoadError() );
     
      //determine the structure to work on
      String structureName = jc.get(STRUCTURE_NAME_JC_KEY);
      if (structureName == null)
        throw new IOException("JobConf property "+STRUCTURE_NAME_JC_KEY+" not specified");
     
      //get the split
      MetaIndexSplit s = (MetaIndexSplit)_split;
      return new MetaIndexInputStreamRecordReader(index, structureName, s.startId, s.endId);     
    }
   
    private static String[] getHosts(FileStatus fs, FileSystem f, long start, long len) throws IOException
    {
      BlockLocation[] bs = f.getFileBlockLocations(fs, start, len);
      Set<String> hosts = new HashSet<String>();
      for(BlockLocation b : bs)
      {
        for(String host : b.getHosts())
        {
          hosts.add(host);
        }
      }
      return hosts.toArray(new String[0]);
    }
    /**
     * {@inheritDoc}
     */
    public InputSplit[] getSplits(JobConf jc, int advisedNumberOfSplits)
        throws IOException
    {
      logger.setLevel(Level.DEBUG);
      HadoopUtility.loadTerrierJob(jc);
      List<InputSplit> splits = new ArrayList<InputSplit>(advisedNumberOfSplits);
      Index index = HadoopUtility.fromHConfiguration(jc);
      String structureName = jc.get(STRUCTURE_NAME_JC_KEY);
      final String dataFilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + structureName + ".zdata";
      final String indxFilename = index.getPath() + ApplicationSetup.FILE_SEPARATOR + index.getPrefix() + "." + structureName + ".idx";
      final DataInputStream idx = new DataInputStream(Files.openFileStream(indxFilename));
      FileSystem fSys = FileSystem.get(jc);
      FileStatus fs = fSys.getFileStatus(new Path(dataFilename));
     
      final int entryCount = index.getIntIndexProperty("index."+structureName+".entries", 0);
      long dataFileBlockSize = fs.getBlockSize();
      if (forcedDataFileBlockSize != -1) dataFileBlockSize = forcedDataFileBlockSize;
      logger.debug("Block size for "+ dataFilename + " is " + dataFileBlockSize);
      //logger.debug("FSStatus("+dataFilename+")="+ fs.toString());
      int startingId = 0;
      int currentId = 0;
      long startingBlockLocation = 0;
      long blockSizeSoFar = 0;
      long lastRead = idx.readLong();
      while(++currentId < entryCount)
      {
        lastRead = idx.readLong();
        blockSizeSoFar = lastRead - startingBlockLocation;
        //logger.debug("Offset for docid "+ currentId + " is " + lastRead + " blockSizeSoFar="+blockSizeSoFar + " blockStartsAt="+startingBlockLocation);
        if (blockSizeSoFar > dataFileBlockSize)
        {
          final String[] hosts = getHosts(fs, fSys, startingBlockLocation, blockSizeSoFar);
          MetaIndexSplit s = new MetaIndexSplit(new Path(dataFilename), startingBlockLocation, blockSizeSoFar, hosts, startingId, currentId);
          splits.add(s);
          logger.debug("Got split: "+ s.toString());
         
          blockSizeSoFar = 0;
          startingBlockLocation = lastRead + 1;
          startingId = currentId +1;
        }
      }
      if (startingId < currentId)
      {
        blockSizeSoFar = lastRead - startingBlockLocation;
        final String[] hosts = getHosts(fs, fSys, startingBlockLocation, blockSizeSoFar);
        MetaIndexSplit s = new MetaIndexSplit(new Path(dataFilename), startingBlockLocation, blockSizeSoFar, hosts, startingId, currentId-1);
        logger.debug("Got last split: "+ s);
        splits.add(s);
      }
      idx.close();
      logger.debug("Got "+ splits.size() + " splits when splitting meta index");
      return splits.toArray(new InputSplit[0]);
    }
   
    long forcedDataFileBlockSize = -1;
   
    /** Permit the blocksize to be overridden, useful for testing different code paths */
    public void overrideDataFileBlockSize(long blocksize)
    {
      forcedDataFileBlockSize = blocksize;
    }
    /**
     * Validates the structure based on the job configuration
     */
    public void validateInput(JobConf jc) throws IOException {
      if (jc.get(STRUCTURE_NAME_JC_KEY, null) == null)
        throw new WrappedIOException(new IllegalArgumentException("Key " + STRUCTURE_NAME_JC_KEY +" not specified"));
    }
   
  }
 
  /** thread-local cache of Inflaters to be re-used for decompression */
  protected static final ThreadLocal<Inflater> inflaterCache = new ThreadLocal<Inflater>()
  {
    protected final synchronized Inflater initialValue() {
      return new Inflater();
    }
  };
 
  static interface ByteAccessor extends java.io.Closeable
  {
    byte[] read(long offset, int bytes) throws IOException;
  }
 
  static class RandomDataInputAccessor implements ByteAccessor
  {
    final RandomDataInput dataSource;
    public RandomDataInputAccessor(RandomDataInput rdi)
    {
      this.dataSource = rdi;
    }
   
    public final byte[] read(long offset, int bytes) throws IOException
    {
      byte[] out = new byte[bytes];
      dataSource.seek(offset);
      dataSource.readFully(out);
      return out;
    }
   
    public final void close() throws IOException
    {
      dataSource.close();
    }
  }
 
  static class ChannelByteAccessor implements ByteAccessor
  {
    final RandomAccessFile dataSource;
    final FileChannel dataSourceChannel;
   
    public ChannelByteAccessor(RandomAccessFile ds)
    {
      dataSource = ds;
      dataSourceChannel = dataSource.getChannel();
    }
   
    public final byte[] read(long offset, int bytes) throws IOException
    {
      byte[] out = new byte[bytes];
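      //note: MappedByteBuffer.wrap resolves to the inherited ByteBuffer.wrap, so this
      //simply wraps the heap array as a ByteBuffer for the positional channel read below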
      dataSourceChannel.read(MappedByteBuffer.wrap(out), offset);
      return out;
    }
   
    public final void close() throws IOException
    {
      dataSourceChannel.close();
      dataSource.close();
    }
  }
 
  static final class LoggingDocid2OffsetLookup implements Docid2OffsetLookup
  {
    final Docid2OffsetLookup parent;
    public LoggingDocid2OffsetLookup(Docid2OffsetLookup _parent)
    {
      this.parent = _parent;
    }
   
    public int getLength(int docid) throws IOException {
      final int length = this.parent.getLength(docid);
      //logger.debug("Lookup of length of meta record for doc "+ docid + " gave length "+ length);
      return length;
    }
   
    public long getOffset(int docid) throws IOException {
      final long offset = this.parent.getOffset(docid);
      //logger.debug("Lookup of offset of meta record for doc "+ docid + " gave offset "+ offset);
      return offset;
    }
   
    public void close() throws IOException {
      parent.close();
    }   
  }
 
  static interface Docid2OffsetLookup extends java.io.Closeable
  {
    long getOffset(int docid) throws IOException;
    int getLength(int docid) throws IOException;
  }
 
  static class ArrayDocid2OffsetLookup implements Docid2OffsetLookup
    {
        protected final long[] docid2offsets;
        protected final long fileLength;
        protected final int docidCount;

        public ArrayDocid2OffsetLookup(long[] _docid2offsets, long _fileLength)
        {
            docid2offsets = _docid2offsets;
            fileLength = _fileLength;
            docidCount = docid2offsets.length;
        }

        public final long getOffset(final int docid)
        {
            return docid2offsets[docid];
        }
       
        public final int getLength(final int docid)
        {
            return  (docid+1)==docidCount
              ? (int)(fileLength-docid2offsets[docid])
              : (int)(docid2offsets[docid+1] - docid2offsets[docid]);
        }
       
        public void close()
        {}
    }
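
  /* Worked example of the lookup arithmetic above, with illustrative numbers
   * (not taken from any real index): given docid2offsets = {0, 40, 95} and
   * fileLength = 130, getOffset(1) = 40 and getLength(1) = 95 - 40 = 55, while
   * the final entry falls back on the file length: getLength(2) = 130 - 95 = 35. */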
 
  static class OnDiskDocid2OffsetLookup implements Docid2OffsetLookup
    {
    private static final int SIZE_OF_LONG = Long.SIZE / 8;
    final ByteAccessor b;
        int lastDocid = -1;
        long lastOffset = -1;
        int lastLength = -1;
       
        protected final long fileLength;
        protected final int docidCount;

        public OnDiskDocid2OffsetLookup(ByteAccessor _b, int _docCount, long _fileLength)
        {
            b=_b;
            docidCount = _docCount;
            fileLength = _fileLength;
        }

        public final long getOffset(final int docid) throws IOException
        {
          readOffset(docid);
          ////logger.info("Offset for docid "+ docid + " is " + lastOffset);
            return lastOffset;
        }

        public final int getLength(final int docid) throws IOException
        {
          readOffset(docid);
          ////logger.info("length for docid "+ docid + " is " + lastLength);
            return lastLength;
        }
       
        protected final void readOffset(int docid) throws IOException
        {
          if (docid == lastDocid)
            return;
          if (docid +1 == docidCount )
          {
            final byte[] readBuffer = b.read((long)docid * SIZE_OF_LONG, SIZE_OF_LONG);
              lastOffset = (((long)readBuffer[0] << 56) +
                         ((long)(readBuffer[1] & 255) << 48) +
                         ((long)(readBuffer[2] & 255) << 40) +
                         ((long)(readBuffer[3] & 255) << 32) +
                         ((long)(readBuffer[4] & 255) << 24) +
                         ((readBuffer[5] & 255) << 16) +
                         ((readBuffer[6] & 255) <<  8) +
                         ((readBuffer[7] & 255) <<  0));
              lastLength = (int)(fileLength - lastOffset);
          }
          else
          {
            final byte[] readBuffer = b.read((long)docid * SIZE_OF_LONG, SIZE_OF_LONG*2);
              lastOffset = (((long)readBuffer[0] << 56) +
                         ((long)(readBuffer[1] & 255) << 48) +
                         ((long)(readBuffer[2] & 255) << 40) +
                         ((long)(readBuffer[3] & 255) << 32) +
                         ((long)(readBuffer[4] & 255) << 24) +
                         ((readBuffer[5] & 255) << 16) +
                         ((readBuffer[6] & 255) <<  8) +
                         ((readBuffer[7] & 255) <<  0));
              final long tmpLong = (((long)readBuffer[8+0] << 56) +
                        ((long)(readBuffer[8+1] & 255) << 48) +
                        ((long)(readBuffer[8+2] & 255) << 40) +
                        ((long)(readBuffer[8+3] & 255) << 32) +
                        ((long)(readBuffer[8+4] & 255) << 24) +
                        ((readBuffer[8+5] & 255) << 16) +
                        ((readBuffer[8+6] & 255) <<  8) +
                        ((readBuffer[8+7] & 255) <<  0));
              lastLength = (int)(tmpLong - lastOffset);
          }
                   
          lastDocid = docid;
        }

        public void close() throws IOException
        {
          b.close();
        }
    }
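
  /* Hedged sketch (not used by this class): the shift-and-mask code in
   * OnDiskDocid2OffsetLookup.readOffset() decodes a big-endian long, i.e. the
   * layout written by DataOutput.writeLong(). Assuming that layout, the same
   * decode can be expressed with java.nio.ByteBuffer: */
  static long exampleDecodeBigEndianLong(byte[] eightBytes)
  {
    //ByteBuffer defaults to big-endian byte order, matching DataOutput.writeLong
    return java.nio.ByteBuffer.wrap(eightBytes).getLong();
  }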
 
  static class BinarySearchForwardIndex implements OrderedMap<Text, IntWritable>
  {
    int numberOfEntries = 0;
    MetaIndex meta;
    int itemIndex = 0;
   
    public BinarySearchForwardIndex(MetaIndex _meta, int _numberOfEntries, int _itemIndex)
    {
      meta = _meta;
      numberOfEntries = _numberOfEntries;
      itemIndex = _itemIndex;
    }

    public IntWritable get(Object _key) {
      
      int low = 0;
      int high = numberOfEntries;
      
      int i;
      int compareEntry;
      
      final String key = ((Text)_key).toString();
      final IntWritable value = new IntWritable();
      
      try{
        while (low <= high) {
          i = (low + high) >>> 1;
          String[] parts = meta.getAllItems(i);
          
          if ((compareEntry = parts[itemIndex].compareTo(key)) < 0)
            low = i + 1;
          else if (compareEntry > 0)
            high = i - 1;
          else
          {
            //exact match: return the entry index
            value.set(i);
            return value;
          }
        }
        
        if (high == numberOfEntries)
          return null;
        
        i = (high == 0) ? 0 : high;
        String[] parts = meta.getAllItems(i);
        
        if (key.compareTo(parts[itemIndex]) == 0) {
          value.set(i);
          return value;
        }
      } catch (IOException ioe) {
        logger.error("IOException reading FSOrderedMapFile", ioe);
      }
      return null;
    }
   
    public java.util.Map.Entry<Text, IntWritable> get(int index) {
      throw new UnsupportedOperationException("");
    }
   
    public boolean containsKey(Object key) {
      return get(key) != null;
    }
   
    public int size() {
      return numberOfEntries;
    }

    public void clear() {
      throw new UnsupportedOperationException("");
    }   

    public boolean containsValue(Object value) {
      throw new UnsupportedOperationException("");
    }

    public Set<java.util.Map.Entry<Text, IntWritable>> entrySet() {
      throw new UnsupportedOperationException("");
    }   

    public boolean isEmpty() {
      return false;
    }

    public Set<Text> keySet() {
      throw new UnsupportedOperationException("");
    }

    public Integer put(String key, IntWritable value) {
      throw new UnsupportedOperationException("");
    }

    public void putAll(Map<? extends Text, ? extends IntWritable> t) {
      throw new UnsupportedOperationException("");
    }

    public IntWritable remove(Object key) {
      throw new UnsupportedOperationException("");
    }

    public Collection<IntWritable> values() {
      throw new UnsupportedOperationException("");
    }

    public IntWritable put(Text key, IntWritable value) {
      throw new UnsupportedOperationException("");
    }
  }
  /** An iterator for reading a MetaIndex as a stream */
  public static class InputStream implements Iterator<String[]>, java.io.Closeable
  {
    final DataInputStream zdata;
    final DataInputStream idx;
    final protected int compressionLevel;
    final protected int recordLength;
   
    protected Inflater inflater;
   
    protected int keyCount;
    protected int[] keyByteOffset;
    protected int[] valueByteLengths;
    //private int[] valueCharLengths;
   
    final int numberOfRecords;
    final int lastId;
    int index=0;
   
    //String[] metaValues;
   
    protected long lastOffset;
    protected long fileLength;
    /**
     * Constructs an instance of the class with
     * @param _index
     * @param _structureName
     * @param _startingId
     * @param _endId
     * @throws IOException
     */
    public InputStream(Index _index, String _structureName, int _startingId, int _endId) throws IOException
    {
      final String dataFilename = _index.getPath() + ApplicationSetup.FILE_SEPARATOR + _index.getPrefix() + "." + _structureName + ".zdata";
      final String indxFilename = _index.getPath() + ApplicationSetup.FILE_SEPARATOR + _index.getPrefix() + "." + _structureName + ".idx";
      zdata = new DataInputStream(Files.openFileStream(dataFilename));
      idx = new DataInputStream(Files.openFileStream(indxFilename));
      fileLength = Files.length(dataFilename);
     
      //1. int - how much zlib was used
      compressionLevel = _index.getIntIndexProperty("index."+_structureName+".compression-level", 5);
      //2. int - how big each record was before compression
      //recordLength = _index.getIntIndexProperty("index."+_structureName+".entry-length", 0);
      //TR-167: recordLength is counted as characters instead of bytes in Terrier 3.0, and hence is inaccurate.
      //obtain from value character lengths instead
     
      //3. key names
      //keyNames = index.getIndexProperty("index."+_structureName+".key-names", "").split("\\s*,\\s*");
      //4. lengths of each key
      String[] _tmpValueLengths = _index.getIndexProperty("index."+_structureName+".value-lengths", "").split("\\s*,\\s*");
      int i=0;
      valueByteLengths = new int[_tmpValueLengths.length];
      int _recordLength = 0;
      for(String lens : _tmpValueLengths)
      {
        valueByteLengths[i] = FixedSizeTextFactory.getMaximumTextLength(Integer.parseInt(lens));
        _recordLength += valueByteLengths[i];
        i++;
      }
      recordLength = _recordLength;
      keyCount = valueByteLengths.length;
     
      //5. offsets in file
      lastId = _endId;
      numberOfRecords = _index.getIntIndexProperty("index."+_structureName+".entries", 0);
           
      inflater = inflaterCache.get();
      index = _startingId -1;
      long targetSkipped = (long)_startingId  * (long)8;
      long actualSkipped = 0;
      //skip to appropriate place in index file
      while(actualSkipped < targetSkipped)
      {
        actualSkipped += idx.skip(targetSkipped - actualSkipped);
      }
      lastOffset = idx.readLong();
      //now skip forward in data file also
      if (lastOffset > 0)
      {
        long actualSkippedData = 0;
        while(actualSkippedData < lastOffset)
        {
          actualSkippedData += zdata.skip(lastOffset - actualSkippedData);
        }
      }
      keyByteOffset = new int[keyCount];
      int cumulativeOffset = 0;
      for(i=0;i<keyCount;i++)
      {
        //key2length.put(keyNames[i], keyLengths[i]);
        //key2offset.put(keyNames[i], cumulativeOffset);
        keyByteOffset[i] = cumulativeOffset;
        cumulativeOffset += valueByteLengths[i];
      }
    }
    /**
     * Constructs an instance of the class with
     * @param _index
     * @param structureName
     * @throws IOException
     */
    public InputStream(Index _index, String structureName) throws IOException
    {
      this(_index, structureName, 0, -1 + _index.getIntIndexProperty("index."+structureName+".entries", 0));
    }
    /**
     * {@inheritDoc}
     */
    public boolean hasNext() {
      ////logger.info("Checking that docid "+ index + " not greater than "+ lastId);
      return index < lastId;     
    }
    /** Return the position that we are at (entry number) */
    public int getIndex()
    {
      return index;
    }
    /**
     * {@inheritDoc}
     */
    public String[] next() {
      index++;
      long endOffset = -1;
      long startOffset = -1;
      try{
        ////logger.info("Checking for index "+ (index+1) + " < last possible id " + numberOfRecords);
        endOffset = index < (numberOfRecords-1)
          ? idx.readLong() -1
          : fileLength-1;
        startOffset = lastOffset;
        final int dataLength = (int)(endOffset - lastOffset + 1);
        ////logger.info("Reading zdata file docid="+index+" start=" + lastOffset + " end="+endOffset + " length="+dataLength);
        byte[] b = new byte[dataLength];
        zdata.readFully(b);
        lastOffset = endOffset +1;
        inflater.reset();
        inflater.setInput(b);
        byte[] bOut = new byte[recordLength];
        inflater.inflate(bOut);
        String[] sOut = new String[keyCount];
            for(int i=0;i<keyCount;i++)
            {
                sOut[i] = Text.decode(
                    bOut,
                    keyByteOffset[i],
                    valueByteLengths[i]).trim();
            }
            ////logger.info("Got entry " + Arrays.deepToString(sOut));
            return sOut;
      } catch (Exception ioe) {
        logger.error("Problem reading MetaIndex as a stream. index="+ index + " start="+startOffset+" endOffset="+endOffset, ioe);
        return null;
      }
    }
    /**
     * {@inheritDoc}
     */
    public void remove() {
      throw new UnsupportedOperationException();
    }
    /**
     * {@inheritDoc}
     */
    public void close() throws IOException
    {
      zdata.close();
      idx.close();
    }
   
  }
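
  /* Hedged usage sketch: streaming every record of the conventional "meta"
   * structure of an already-opened Index. The structure name and the printing
   * to stdout are assumptions for illustration only. */
  static void exampleStreamAllEntries(Index exampleIndex) throws IOException
  {
    InputStream metaStream = new InputStream(exampleIndex, "meta");
    while (metaStream.hasNext())
    {
      String[] record = metaStream.next();   //one value per key, in key-name order
      int docid = metaStream.getIndex();     //entry number of the record just returned
      System.out.println(docid + " -> " + Arrays.toString(record));
    }
    metaStream.close();
  }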
 
  protected Docid2OffsetLookup offsetLookup;
 
  //protected long[] docid2offsets;
  protected int compressionLevel;
  protected int recordLength;
  //protected long fileLength;
 
  //protected int EntryLength;

  protected String[] keyNames;
  protected TObjectIntHashMap<String> key2byteoffset;
  protected TObjectIntHashMap<String> key2bytelength;
 
  protected TObjectIntHashMap<String> key2forwardOffset;
 
  protected int keyCount;
  protected int[] valueByteOffsets;
  protected int[] valueByteLengths;
 
  protected final String path;
  protected final String prefix;
 
  protected final ByteAccessor dataSource;
  protected Map<Text,IntWritable>[] forwardMetaMaps;
  protected FixedSizeWriteableFactory<Text>[] keyFactories;
 
  /**
   * Construct an instance of the class with
   * @param index
   * @param structureName
   * @throws IOException
   */
  public CompressingMetaIndex(Index index, String structureName)
    throws IOException
  {
    this.path = index.getPath(); this.prefix = index.getPrefix();
    loadIndex(index, structureName);
    final String dataFilename =
      path + ApplicationSetup.FILE_SEPARATOR + prefix + "."+structureName+".zdata";
    long dataFileLength = Files.length(dataFilename);

    String fileSource = index.getIndexProperty("index."+structureName + ".data-source", "fileinmem");
    ByteAccessor _dataSource = null;
    if (fileSource.equals("fileinmem"))
    {
      //logger.info("Structure "+ structureName + " loading data file into memory");
      try{
        logger.debug("Caching metadata file "+ dataFilename + " to memory");
        final DataInputStream di = new DataInputStream(Files.openFileStream(dataFilename));
        _dataSource = new RandomDataInputAccessor(new RandomDataInputMemory(di, dataFileLength));
      } catch (OutOfMemoryError oome) {
        //logger.warn("OutOfMemoryError: Structure "+ structureName + " reading data file directly from disk");
        //logger.debug("Metadata will be read directly from disk");
        RandomDataInput rfi = Files.openFileRandom(dataFilename);
        _dataSource = (rfi instanceof RandomAccessFile)
          ? new ChannelByteAccessor((RandomAccessFile)rfi)
          : new RandomDataInputAccessor(rfi);
      }
      dataSource = _dataSource;
    }
    else if (fileSource.equals("file"))
    {
      //logger.warn("Structure "+ structureName + " reading data file directly from disk (SLOW)");
      //logger.debug("Metadata will be read directly from disk");
      RandomDataInput rfi = Files.openFileRandom(dataFilename);
      dataSource = (rfi instanceof RandomAccessFile)
        ? new ChannelByteAccessor((RandomAccessFile)rfi)
        : new RandomDataInputAccessor(rfi);
    }
    else
    {
      throw new IOException(
        "Bad property value for index."+structureName + ".source="+fileSource);
    }
  }
  /**
   * {@inheritDoc}
   */
  public String[] getKeys()
  {
    return this.keyNames;
  }
 
  /** Closes the underlying structures.*/
  public void close() throws IOException {
    dataSource.close();
    offsetLookup.close();
    for (Map<Text,IntWritable> m : forwardMetaMaps)
    {
      IndexUtil.close(m);
    }
  }
 
  /** {@inheritDoc} */
  public int getDocument(String key, String value) throws IOException {
    final int forwardId = key2forwardOffset.get(key) -1;
    if (forwardId == -1)
      throw new NoSuchElementException("No reverse lookup for key " + key + " is supported");
    final Text wKey = keyFactories[forwardId].newInstance();
    wKey.set(value);
    assert forwardMetaMaps[forwardId].size() > 0;
    final IntWritable rtr = forwardMetaMaps[forwardId].get(wKey);   
    if (rtr == null)
      return -1;
    return rtr.get();
  }
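
  /* Hedged usage sketch: the reverse lookup above paired with a forward lookup.
   * It assumes the index was built with "docno" listed in the structure's
   * reverse-key-names property; "DOC-001" is only an example value. */
  void exampleReverseThenForward() throws IOException
  {
    int docid = getDocument("docno", "DOC-001"); //-1 when the value is unknown
    if (docid != -1)
    {
      String docno = getItem("docno", docid);    //decompress the record and pull one key
      System.out.println(docid + " -> " + docno);
    }
  }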
 
  /** {@inheritDoc}
   * In this implementation, lookups are performed in ascending docid order to
   * improve disk cache hit rates; the _docids array passed in is left unchanged.
   */
  public String[] getItems(String Key, int[] _docids) throws IOException {
    final int numDocs = _docids.length;
    final int[] docids = new int[numDocs];
    System.arraycopy(_docids, 0, docids, 0, numDocs);
    final String values[] = new String[numDocs];
    //optimisation: order by docid, to improve disk cache hit rate
    final int[] order = new int[numDocs];
    for(int i=0;i<numDocs;i++)
      order[i] = i;
    HeapSortInt.ascendingHeapSort(docids, order);
   
    for(int i=0;i<numDocs;i++)
    {
      values[order[i]] = getItem(Key, docids[i]);
    }
    return values;
  }

  /** {@inheritDoc}
   *  In this implementation, lookups are performed in ascending docid order to
   *  improve disk cache hit rates; the _docids array passed in is left unchanged. */
  public String[][] getItems(String Keys[], final int[] _docids) throws IOException {
    final int numDocs = _docids.length;
    final int[] docids = new int[numDocs];
    System.arraycopy(_docids, 0, docids, 0, numDocs);
    final String[][] saOut = new String[numDocs][];
   
    //optimisation: order by docid, to improve disk cache hit rate
    final int[] order = new int[numDocs];
    for(int i=0;i<numDocs;i++)
      order[i] = i;
    HeapSortInt.ascendingHeapSort(docids, order);
   
    for(int i=0;i<numDocs;i++)
    {
      saOut[order[i]] = getItems(Keys, docids[i]);
    }
    return saOut;
  }

  /** {@inheritDoc} */ 
  public String getItem(String Key, int docid)
        throws IOException
    {
    Inflater unzip = inflaterCache.get();
    unzip.reset();
    unzip.setInput(dataSource.read(
      offsetLookup.getOffset(docid), offsetLookup.getLength(docid)
      ));
   
    byte[] bOut = new byte[recordLength];
    try {
      unzip.inflate(bOut);
    } catch(DataFormatException dfe) {
      logger.error(dfe);
    }
    return Text.decode(bOut, key2byteoffset.get(Key), key2bytelength.get(Key)).trim();
    }
 
  /** {@inheritDoc} */
  public String[] getItems(String[] Keys, int docid) throws IOException {
    Inflater unzip = inflaterCache.get();
    unzip.reset();
    unzip.setInput(dataSource.read(
        offsetLookup.getOffset(docid), offsetLookup.getLength(docid)
        ));
    byte[] bOut = new byte[recordLength];
    try {
      unzip.inflate(bOut);
    } catch(DataFormatException dfe) {
      logger.error(dfe);
    }
        final int kCount = Keys.length;
        String[] sOut = new String[kCount];
        for(int i=0;i<kCount;i++)
        {
            sOut[i] = Text.decode(
                bOut,
                key2byteoffset.get(Keys[i]),
                key2bytelength.get(Keys[i])).trim();
        }
        return sOut;
    }
 
  /** {@inheritDoc} */
  public String[] getAllItems(int docid) throws IOException {
    Inflater unzip = inflaterCache.get();
    unzip.reset();
    unzip.setInput(dataSource.read(
        offsetLookup.getOffset(docid), offsetLookup.getLength(docid)
        ));
    //unzip.setInput(
    //    dataSource.read(docid2offsets[docid],
    //        (docid+1)==docid2offsets.length ? (int)(fileLength-docid2offsets[docid])
    //                                        : (int)(docid2offsets[docid+1] - docid2offsets[docid])));
    byte[] bOut = new byte[recordLength];
    try {
      unzip.inflate(bOut);
    } catch(DataFormatException dfe) {
      logger.error(dfe);
    }
        final int kCount = this.keyCount;
        String[] sOut = new String[kCount];
       
        for(int i=0;i<kCount;i++)
        {
            sOut[i] = Text.decode(
                bOut,
                valueByteOffsets[i],
                valueByteLengths[i]).trim();
        }
        return sOut;
  }

  @SuppressWarnings("unchecked")
  protected void loadIndex(Index index, String structureName) throws IOException {
      
    //1. int - how much zlib was used
    compressionLevel = index.getIntIndexProperty("index."+structureName+".compression-level", 5);
    //2. int - how big each record was before compression
    //recordLength = index.getIntIndexProperty("index."+structureName+".entry-length", 0);
    //TR-167: recordLength is counted as characters instead of bytes in Terrier 3.0, and hence is inaccurate.
    //obtain from value character lengths instead
   
    //3. key names
    keyNames = index.getIndexProperty("index."+structureName+".key-names", "").split("\\s*,\\s*");
    //4. lengths of each key
    String[] _tmpValueLengths = index.getIndexProperty("index."+structureName+".value-lengths", "").split("\\s*,\\s*");
    int i=0;
    valueByteLengths = new int[_tmpValueLengths.length];
    int[] valueCharLengths = new int[_tmpValueLengths.length];
    recordLength = 0;
    for(String lens : _tmpValueLengths)
    {
      valueCharLengths[i] = Integer.parseInt(lens);
      valueByteLengths[i] = FixedSizeTextFactory.getMaximumTextLength(valueCharLengths[i]);
      recordLength += valueByteLengths[i];
      i++;
    }
   
    //5. (long[]) length (numDocs+1) - offsets in file
    final int length = index.getIntIndexProperty("index."+structureName+".entries", 0);
   
    String indexFilename = path+ApplicationSetup.FILE_SEPARATOR+prefix+"."+structureName+".idx";
    String dataFilename = path+ApplicationSetup.FILE_SEPARATOR+prefix+"."+structureName+".zdata";
    String indexSource = index.getIndexProperty("index."+structureName + ".index-source", "fileinmem");
    long indexFileLength = Files.length(indexFilename);
    long dataFileLength = Files.length(dataFilename);
   
    if (indexSource.equals("fileinmem"))
    {
      //logger.info("Structure "+ structureName + " reading lookup file into memory");
      if (indexFileLength < Integer.MAX_VALUE)
      { 
        try{
          DataInputStream dis = new DataInputStream(Files.openFileStream(indexFilename));
          final long[] docid2offsets = new long[length];
          for(i=0;i<length;i++)
            docid2offsets[i] = dis.readLong();
          logger.debug("docid2offsets.length: " + docid2offsets.length + " ZIP_COMPRESSION_LEVEL: " + compressionLevel + " recordLength: " + recordLength);
          offsetLookup = new ArrayDocid2OffsetLookup(docid2offsets, dataFileLength);
          //finished with index file
          dis.close();
        } catch (OutOfMemoryError oome) {
          //logger.warn("OutOfMemoryError: Structure "+ structureName + " reading lookup file directly from disk");
          //logger.debug("Metadata lookup will be read directly from disk: "+ length +" entries, size "+ dataFileLength + " bytes");
          RandomDataInput rfi = Files.openFileRandom(indexFilename);
          offsetLookup = new OnDiskDocid2OffsetLookup(
            rfi instanceof RandomAccessFile
              ? new ChannelByteAccessor((RandomAccessFile)rfi)
              : new RandomDataInputAccessor(rfi),
            length, dataFileLength
            );
        }
      }
      else
      {
        try{
          DataInputStream dis = new DataInputStream(Files.openFileStream(indexFilename));
          offsetLookup = new OnDiskDocid2OffsetLookup(new RandomDataInputAccessor(new RandomDataInputMemory(dis, indexFileLength)),length, dataFileLength);
          dis.close();
        }
        catch (OutOfMemoryError oome) {
          //logger.warn("OutOfMemoryError: Structure "+ structureName + " reading lookup file directly from disk");
          //logger.debug("Metadata lookup will be read directly from disk: "+ length +" entries, size "+ dataFileLength + " bytes");
          RandomDataInput rfi = Files.openFileRandom(indexFilename);
          offsetLookup = new OnDiskDocid2OffsetLookup(
            rfi instanceof RandomAccessFile
              ? new ChannelByteAccessor((RandomAccessFile)rfi)
              : new RandomDataInputAccessor(rfi),
            length, dataFileLength
            );
        }
      } 
    } else {
      //logger.warn("Structure "+ structureName + " reading lookup file directly from disk (SLOW)");
      //logger.debug("Metadata lookup will be read directly from disk: "+ length +" entries, size "+ dataFileLength + " bytes");
      RandomDataInput rfi = Files.openFileRandom(indexFilename);
      offsetLookup = new OnDiskDocid2OffsetLookup(
        rfi instanceof RandomAccessFile
          ? new ChannelByteAccessor((RandomAccessFile)rfi)
          : new RandomDataInputAccessor(rfi),
        length, dataFileLength
        );
    }
    //debug log lookups using a wrapper class
    if (logger.isDebugEnabled())
      offsetLookup = new LoggingDocid2OffsetLookup(offsetLookup);
     
   
   

    //now build the keyname and lengths into 2 maps:
    // keyname -> length & keyname -> offsets
    keyCount = keyNames.length;
    key2bytelength = new TObjectIntHashMap<String>(keyCount);
    TObjectIntHashMap<String> key2stringlength = new TObjectIntHashMap<String>(keyCount);
    key2byteoffset = new TObjectIntHashMap<String>(keyCount);
    valueByteOffsets = new int[keyCount];
    int cumulativeOffset = 0;
    for(i=0;i<keyCount;i++)
    {
      key2stringlength.put(keyNames[i], valueCharLengths[i]);
      key2bytelength.put(keyNames[i], valueByteLengths[i]);
      key2byteoffset.put(keyNames[i], cumulativeOffset);
      valueByteOffsets[i] = cumulativeOffset;
      cumulativeOffset += valueByteLengths[i];
    }
   
    key2forwardOffset = new TObjectIntHashMap<String>(2);
    final String[] forwardKeys = index.getIndexProperty("index."+structureName+".reverse-key-names", "").split("\\s*,\\s*");
    forwardMetaMaps = (Map<Text,IntWritable>[])new Map[forwardKeys.length];
    keyFactories = (FixedSizeWriteableFactory<Text>[])new FixedSizeWriteableFactory[forwardKeys.length];
    i=0;
    final FixedSizeIntWritableFactory valueFactory = new FixedSizeIntWritableFactory();
    for(String keyName : forwardKeys)
    {
      if (keyName.trim().equals(""))
        continue;
      key2forwardOffset.put(keyName, 1+i);
      logger.debug("Forward key "+ keyName +", length="+ key2bytelength.get(keyName));
      keyFactories[i] = new FixedSizeTextFactory(key2stringlength.get(keyName));
      String filename = path+ApplicationSetup.FILE_SEPARATOR+prefix+"."+structureName+"-"+i+FSOrderedMapFile.USUAL_EXTENSION;
      String loadFormat = index.getIndexProperty("index."+structureName+".reverse."+keyName+".in-mem", "false");
      if (loadFormat.equals("hashmap"))
      {
        //logger.info("Structure "+ structureName + " reading reverse map for key "+ keyName + " into memory as hashmap");
        forwardMetaMaps[i] = new FSOrderedMapFile.MapFileInMemory<Text, IntWritable>(
            filename,
            keyFactories[i],
            valueFactory);
      }
      else if (loadFormat.equals("mapfileinmem"))
      {
       
        final long revDataFileLength = Files.length(filename);
        //if (revDataFileLength > Integer.MAX_VALUE)
        //{
        //  loadFormat = "false";
        //  //logger.info("Structure "+ structureName + " reading reverse map for key "+ keyName + " - too big for memory as bytearray");
        //}
        //else
        //{ 
          //logger.info("Structure "+ structureName + " reading reverse map for key "+ keyName + " into memory as bytearray");
          DataInputStream dis = new DataInputStream(Files.openFileStream(filename));
          //final byte[] bytes = new byte[(int)revDataFileLength];
          //dis.readFully(bytes);
          //dis.close();       
          forwardMetaMaps[i] = new FSOrderedMapFile<Text, IntWritable>(
              new RandomDataInputMemory(dis, revDataFileLength),
              filename,
              keyFactories[i],
              valueFactory);
        //}
      }
     
      if (loadFormat.equals("false"))
      { 
        //logger.info("Structure "+ structureName + " reading reverse map for key "+ keyName + " directly from disk");
        forwardMetaMaps[i] = new FSOrderedMapFile<Text, IntWritable>(
            filename,
            false,
            keyFactories[i],
            valueFactory);
      }
      i++;
    }
  }
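
  /* Illustrative index properties (assumed values, using the conventional "meta"
   * structure name) of the kind read by the constructor and loadIndex() above:
   *   index.meta.key-names=docno,url
   *   index.meta.value-lengths=20,140
   *   index.meta.entries=25000000
   *   index.meta.compression-level=5
   *   index.meta.data-source=fileinmem          (or "file" to read from disk)
   *   index.meta.index-source=fileinmem         (anything else reads the .idx from disk)
   *   index.meta.reverse-key-names=docno
   *   index.meta.reverse.docno.in-mem=false     (or "hashmap" / "mapfileinmem")
   */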
  /**
   * main
   * @param args
   * @throws Exception
   */
  public static void main(String args[]) throws Exception
  {
    if (args.length == 0)
    {
      System.err.println("Usage: " + CompressingMetaIndex.class.getName() + " {print|printrange min max|get docid|docno} ");
      return;
    }
   
    //load structures that we actually need
    Index.setIndexLoadingProfileAsRetrieval(false);
    Index index = Index.createIndex();
    if (args[0].equals("print"))
    {
      IndexUtil.printMetaIndex(index, "meta");
    }
    else if (args[0].equals("printrange"))
    {
      Iterator<String[]> inputStream = new InputStream(index, "meta", Integer.parseInt(args[1]), Integer.parseInt(args[2]));
      while(inputStream.hasNext())
      {
        System.out.println(Arrays.toString(inputStream.next()));
      }
      IndexUtil.close(inputStream);
    }
    else if (args[0].equals("get"))
    {
      MetaIndex m = index.getMetaIndex();
      int docid = Integer.parseInt(args[1]);
      String[] values = m.getAllItems(docid);
      String[] keys = m.getKeys();
      for(int i=0;i<keys.length;i++)
      {
        System.out.println(keys[i] + "=" + values[i]);
      }
    }
    else
    {
      MetaIndex m = index.getMetaIndex();     
      int docid = m.getDocument("docno", args[0]);
      System.out.println(args[0] + " -> " + docid);
      String value = m.getItem("docno", docid);
      System.out.println(docid + " -> " + value);
      System.out.println("Equals check: " + value.equals(args[0]));
    }
  }
 
}